library(aplore3)
library(ggplot2)
library(dplyr)
library(plotly)
library(ca)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
library(sjPlot)
library(sjmisc)
library(caret)
library(vcd)
library(FactoMineR)
#??glow500

Data Format: A data.frame with 500 rows and 18 variables such as:

priorfrac - If the patient previously had a fracture
age - Age at Enrollment (years)
weight - Weight at enrollment (Kilograms)
height - Height at enrollment (Centimeters)
bmi - Body Mass Index (Kg/m^2)
premeno - Menopause before age 45 (1: No, 2: Yes)
momfrac - Mother had hip fracture (1: No, 2: Yes)
armassist - Arms are needed to stand from a chair (1: No, 2: Yes)
smoke - Former or current smoker (1: No, 2: Yes)
raterisk - Self-reported risk of fracture (1: Less than others of the same age, 2: Same as others of the same age, 3: Greater than others of the same age
fracscore - Fracture Risk Score (Composite Risk Score)
fracture - Any fracture in first year (1: No, 2: Yes)
bonemed - Bone medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at enrollment and follow-up (1: No, 2: Yes)

head(glow_bonemed)
##   sub_id site_id phy_id priorfrac age weight height      bmi premeno momfrac
## 1      1       1     14        No  62   70.3    158 28.16055      No      No
## 2      2       4    284        No  65   87.1    160 34.02344      No      No
## 3      3       6    305       Yes  88   50.8    157 20.60936      No     Yes
## 4      4       6    309        No  82   62.1    160 24.25781      No      No
## 5      5       1     37        No  61   68.0    152 29.43213      No      No
## 6      6       5    299       Yes  67   68.0    161 26.23356      No      No
##   armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1        No    No     Same         1       No      No         No        No
## 2        No    No     Same         2       No      No         No        No
## 3       Yes    No     Less        11       No      No         No        No
## 4        No    No     Less         5       No      No         No        No
## 5        No    No     Same         1       No      No         No        No
## 6        No   Yes     Same         4       No      No         No        No

Summary statistics for numeric variables

#Set fracscore to integer for summary statistics
glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
mysummary = glow_bonemed %>%
  select(age, weight, height, bmi, fracscore) %>%
  summarise_each(
    funs(min = min, 
    q25 = quantile(., 0.25), 
    median = median, 
    q75 = quantile(., 0.75), 
    max = max,
    mean = mean, 
    sd = sd,
    variance= var))

# reshape it using tidyr functions

clean.summary = mysummary %>% 
  gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  select(var, min, max, mean, sd, variance)

print(clean.summary)
##         var       min       max      mean        sd   variance
## 1       age  55.00000  90.00000  68.56200  8.989537  80.811780
## 2       bmi  14.87637  49.08241  27.55303  5.973958  35.688178
## 3 fracscore   0.00000  11.00000   3.69800  2.495446   6.227251
## 4    height 134.00000 199.00000 161.36400  6.355493  40.392289
## 5    weight  39.90000 127.00000  71.82320 16.435992 270.141825

Summary statistics for categorical variables

summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
##  priorfrac premeno   momfrac   armassist smoke        raterisk   fracture 
##  No :374   No :403   No :435   No :312   No :465   Less   :167   No :375  
##  Yes:126   Yes: 97   Yes: 65   Yes:188   Yes: 35   Same   :186   Yes:125  
##                                                    Greater:147            
##  bonemed   bonemed_fu bonetreat
##  No :371   No :361    No :382  
##  Yes:129   Yes:139    Yes:118  
## 

No missing values

colSums(is.na(glow_bonemed))
##     sub_id    site_id     phy_id  priorfrac        age     weight     height 
##          0          0          0          0          0          0          0 
##        bmi    premeno    momfrac  armassist      smoke   raterisk  fracscore 
##          0          0          0          0          0          0          0 
##   fracture    bonemed bonemed_fu  bonetreat 
##          0          0          0          0
sum(is.na(glow_bonemed))
## [1] 0

Age vs Weight: As weight increases the average age decreases
Age vs Height: Weak correlation of as height increases age decreases
Age vs BMI: As bmi increases the average age decreases
Age vs fracscore: As age increases the average fracscore increases

Weight vs Height: As height increases the average weight increases
Weight vs BMI: As bmi increases the average weight increases
Weight vs fracscore: As fracscore increases the average Weight decreases

Height vs BMI: As bmi increases the average height and variance stay the same
Height vs fracscore: As fracscore increases the average height stays the same though variance might decrease

BMI vs fracscore: As fracscore increases the average bmi decreases

plot(glow_bonemed[, c(5:8, 14)])

Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable

ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter() +
  labs(title = "BMI vs Age")

ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter() +
  labs(title = "Fracture Score vs BMI")

ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter() +
  labs(title = "Age vs Fracture Score")

ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter() +
  labs(title = "Height vs Weight")

Once again there doesn’t seem to be strong groupings of the fracture categorical variable

fracture3dplot = plot_ly(glow_bonemed, 
  x = ~age, 
  y = ~height, 
  z = ~bmi, 
  color = ~fracture, 
  colors = c('#0C4B8E', '#BF382A')) %>% add_markers()

fracture3dplot

There are so little “yes” fracture results that the plot isn’t very useful

## `geom_smooth()` using formula = 'y ~ x'

The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers

ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")

Plot confirms there is a strong correlation between age/fracscore, bmi/weight

corr_vars <- c("age", "weight", "height", "bmi", "fracscore")

corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)

ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
  colors = c("#6D9EC1", "white", "#E46726")) +
  labs(title = "Correlation Between Variables") +
  theme(plot.title = element_text(hjust = .5),
  plot.subtitle = element_text(hjust = .5))

Categorical Variable EDA

mosaicplot(table(glow_bonemed$priorfrac, glow_bonemed$fracture), color = TRUE)

chisq.test(table(glow_bonemed$priorfrac, glow_bonemed$fracture))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(glow_bonemed$priorfrac, glow_bonemed$fracture)
## X-squared = 22.635, df = 1, p-value = 1.959e-06

Perform chi-squared test on all categorical variables

glow_bonemed$fracscore = as.factor(glow_bonemed$fracscore)

blow_bonemed_categoricals = glow_bonemed[, c(4, 9:18)]

multipleCorrespondenceAnalysis = MCA(blow_bonemed_categoricals, quali.sup = "fracture", graph = FALSE)
multipleCorrespondenceAnalysis
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 500 individuals, described by 11 variables
## *The results are available in the following objects:
## 
##    name                description                                          
## 1  "$eig"              "eigenvalues"                                        
## 2  "$var"              "results for the variables"                          
## 3  "$var$coord"        "coord. of the categories"                           
## 4  "$var$cos2"         "cos2 for the categories"                            
## 5  "$var$contrib"      "contributions of the categories"                    
## 6  "$var$v.test"       "v-test for the categories"                          
## 7  "$ind"              "results for the individuals"                        
## 8  "$ind$coord"        "coord. for the individuals"                         
## 9  "$ind$cos2"         "cos2 for the individuals"                           
## 10 "$ind$contrib"      "contributions of the individuals"                   
## 11 "$quali.sup"        "results for the supplementary categorical variables"
## 12 "$quali.sup$coord"  "coord. for the supplementary categories"            
## 13 "$quali.sup$cos2"   "cos2 for the supplementary categories"              
## 14 "$quali.sup$v.test" "v-test for the supplementary categories"            
## 15 "$call"             "intermediate results"                               
## 16 "$call$marge.col"   "weights of columns"                                 
## 17 "$call$marge.li"    "weights of rows"

All of the categorical variables seem to have significant P values with bonetreate, bonemed, and bonemed_fu all having R^2 above .83

firstDimension = dimdesc(multipleCorrespondenceAnalysis, axes = 1)
firstDimension
## $`Dim 1`
## 
## Link between the variable and the categorical variable (1-way anova)
## =============================================
##                    R2       p.value
## bonetreat  0.87245886 7.754159e-225
## bonemed    0.84726742 2.443029e-205
## bonemed_fu 0.83803940 5.422994e-199
## raterisk   0.19571783  3.118876e-24
## fracscore  0.19131230  2.223696e-17
## priorfrac  0.10838142  4.201009e-14
## fracture   0.04922945  5.400633e-07
## armassist  0.02280481  7.047091e-04
## premeno    0.01718799  3.315107e-03
## smoke      0.01441500  7.194873e-03
## 
## Link between variable and the categories of the categorical variables
## ================================================================
##                              Estimate       p.value
## bonetreat=bonetreat_Yes    0.61370510 7.754159e-225
## bonemed=bonemed_Yes        0.58693281 2.443029e-205
## bonemed_fu=bonemed_fu_Yes  0.57007390 5.422994e-199
## raterisk=Greater           0.32172629  2.159398e-19
## priorfrac=priorfrac_Yes    0.21155159  4.201009e-14
## fracscore=10               1.42877661  4.142645e-07
## fracture=fracture_Yes      0.14295580  5.400633e-07
## fracscore=9                0.47787785  1.527510e-04
## fracscore=8                0.21858281  3.120054e-04
## armassist=armassist_Yes    0.08697950  7.047091e-04
## fracscore=7                0.09452369  8.158244e-04
## premeno=premeno_No         0.09249838  3.315107e-03
## smoke=smoke_No             0.13128246  7.194873e-03
## fracscore=5               -0.05350173  4.364518e-02
## smoke=smoke_Yes           -0.13128246  7.194873e-03
## premeno=premeno_Yes       -0.09249838  3.315107e-03
## fracscore=1               -0.40626672  1.010225e-03
## armassist=armassist_No    -0.08697950  7.047091e-04
## fracscore=0               -0.49480779  3.625746e-05
## fracture=fracture_No      -0.14295580  5.400633e-07
## priorfrac=priorfrac_No    -0.21155159  4.201009e-14
## raterisk=Less             -0.30243929  2.386909e-17
## bonemed_fu=bonemed_fu_No  -0.57007390 5.422994e-199
## bonemed=bonemed_No        -0.58693281 2.443029e-205
## bonetreat=bonetreat_No    -0.61370510 7.754159e-225

bonetreat, bonemed, bonemed_fu all have a sensitivity around 78% and specificity between 32% and 42%

confusionMatrix(table(glow_bonemed$bonetreat, glow_bonemed$fracture)) 
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  297  85
##   Yes  78  40
##                                         
##                Accuracy : 0.674         
##                  95% CI : (0.631, 0.715)
##     No Information Rate : 0.75          
##     P-Value [Acc > NIR] : 0.9999        
##                                         
##                   Kappa : 0.1141        
##                                         
##  Mcnemar's Test P-Value : 0.6384        
##                                         
##             Sensitivity : 0.7920        
##             Specificity : 0.3200        
##          Pos Pred Value : 0.7775        
##          Neg Pred Value : 0.3390        
##              Prevalence : 0.7500        
##          Detection Rate : 0.5940        
##    Detection Prevalence : 0.7640        
##       Balanced Accuracy : 0.5560        
##                                         
##        'Positive' Class : No            
## 
confusionMatrix(table(glow_bonemed$bonemed, glow_bonemed$fracture))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  292  79
##   Yes  83  46
##                                          
##                Accuracy : 0.676          
##                  95% CI : (0.633, 0.7169)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.9999         
##                                          
##                   Kappa : 0.1451         
##                                          
##  Mcnemar's Test P-Value : 0.8137         
##                                          
##             Sensitivity : 0.7787         
##             Specificity : 0.3680         
##          Pos Pred Value : 0.7871         
##          Neg Pred Value : 0.3566         
##              Prevalence : 0.7500         
##          Detection Rate : 0.5840         
##    Detection Prevalence : 0.7420         
##       Balanced Accuracy : 0.5733         
##                                          
##        'Positive' Class : No             
## 
confusionMatrix(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  289  72
##   Yes  86  53
##                                           
##                Accuracy : 0.684           
##                  95% CI : (0.6412, 0.7246)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.1877          
##                                           
##  Mcnemar's Test P-Value : 0.3010          
##                                           
##             Sensitivity : 0.7707          
##             Specificity : 0.4240          
##          Pos Pred Value : 0.8006          
##          Neg Pred Value : 0.3813          
##              Prevalence : 0.7500          
##          Detection Rate : 0.5780          
##    Detection Prevalence : 0.7220          
##       Balanced Accuracy : 0.5973          
##                                           
##        'Positive' Class : No              
## 
glow_bonemed$fracscore = as.factor(glow_bonemed$fracscore)

blow_bonemed_categoricals = glow_bonemed[, c(15:18)]

multipleCorrespondenceAnalysis = MCA(blow_bonemed_categoricals, graph = FALSE)

plot.MCA(multipleCorrespondenceAnalysis, 
         cex = 0.8, 
         col.quali.sup = c("red", "blue", "green"), 
         title = "Multiple Correspondence Analysis")

Clustering EDA

pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

zScoreScale = scale(glow_bonemed[, 5:8])

zScoreDistance = dist(zScoreScale)

continuousVariableClustering = hclust(zScoreDistance, method = "complete")

plot(continuousVariableClustering)

Modeling

Split the data into a training/testing set

set.seed(4)

trainingIndices = sample(c(1:dim(glow_bonemed)[1]), dim(glow_bonemed)[1]*0.8)

trainingDataframe = glow_bonemed[trainingIndices,]
testingDataframe = glow_bonemed[-trainingIndices,]

Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)

model = glm(fracture ~ age + weight + height + bmi, data = glow_bonemed[trainingIndices,], family = "binomial")

summary(model)
## 
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial", 
##     data = glow_bonemed[trainingIndices, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2770  -0.7951  -0.6672   1.2602   1.9710  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.006590  13.891104  -0.432 0.665447    
## age          0.045888   0.013481   3.404 0.000664 ***
## weight      -0.039155   0.095610  -0.410 0.682156    
## height       0.008938   0.085578   0.104 0.916814    
## bmi          0.113632   0.247382   0.459 0.645993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 458.45  on 399  degrees of freedom
## Residual deviance: 442.53  on 395  degrees of freedom
## AIC: 452.53
## 
## Number of Fisher Scoring iterations: 4
AIC(model)
## [1] 452.5326
# trying to figure out how to use sjPlot to mimic what we did in unit12 prelive
#plot_model(model, type = "pred", terms = c("age", "smoke"))

Get eigen vectors and values for principle component analysis

glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)

#Eigen Vectors
pc.result$rotation
##                  PC1         PC2         PC3         PC4          PC5
## age        0.4947219  0.46742140 -0.15246583  0.71654567 -0.009160237
## weight    -0.5273035  0.46578775 -0.08840991  0.03240244 -0.704362523
## height    -0.2345770 -0.08196149 -0.93823245  0.01885633  0.240042129
## bmi       -0.4741030  0.51615173  0.24533677  0.05137380  0.667820563
## fracscore  0.4442985  0.53984137 -0.16872342 -0.69463484  0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342

Scree plot

par(mfrow = c(1, 2))

plot(eigenvals,type = "l",
     main = "Scree Plot",
     ylab = "Eigen Values",
     xlab = "PC #")

plot(eigenvals / sum(eigenvals), 
     type = "l", main = "Scree Plot", 
     ylab = "Prop. Var. Explained", 
     xlab = "PC #", 
     ylim = c(0, 1))

cumulative.prop = cumsum(eigenvals / sum(eigenvals))

lines(cumulative.prop, lty = 2)

Naive Bayes model with categorical variables

corr_vars <- c("priorfrac", "premeno", "momfrac", "armassist", "smoke", "raterisk", "fracture", "bonemed", "bonemed_fu", "bonetreat")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
nb.fit = train(fracture ~ .,
               data = trainingDataframe[, corr_vars],
               method = "nb",
               trControl = fitControl
               )
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
nb.fit
## Naive Bayes 
## 
## 400 samples
##   9 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa   
##   FALSE      0.6731191  0.163332
##    TRUE      0.7400891  0.000000
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
##  = 1.

Knn models

corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ .,
               data = trainingDataframe[, corr_vars],
               method = "knn",
               trControl = fitControl,
               tuneGrid = expand.grid(k = c(1:10, 20, 30))
               )
knn.fit
## k-Nearest Neighbors 
## 
## 400 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    1  0.6793168   0.15146923
##    2  0.6643074   0.09772090
##    3  0.6869418   0.07266771
##    4  0.6624969   0.01579128
##    5  0.6802533  -0.01485883
##    6  0.6999484   0.02039872
##    7  0.7047624  -0.01348597
##    8  0.7000094  -0.02251761
##    9  0.7149515  -0.01036712
##   10  0.7298358   0.07106741
##   20  0.7350829   0.02719129
##   30  0.7400891   0.00000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 30.

After tinkering it seems the best model only uses the age variable

corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ age,
               data = trainingDataframe[, corr_vars],
               method = "knn",
               trControl = fitControl,
               tuneGrid = expand.grid(k = c(1:10, 20, 30))
               )
knn.fit
## k-Nearest Neighbors 
## 
## 400 samples
##   1 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    1  0.7178299   0.04451770
##    2  0.7252048   0.04095402
##    3  0.7302720   0.05828196
##    4  0.7202689   0.03291690
##    5  0.7327720   0.05605753
##    6  0.7328971   0.06388898
##    7  0.7454644   0.08867516
##    8  0.7428361   0.07440775
##    9  0.7454644   0.07173952
##   10  0.7276438  -0.01393495
##   20  0.7299578   0.01388032
##   30  0.7302079  -0.00185982
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

#Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013) Applied Logistic Regression, 3rd ed., New York: Wiley

https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90